package spider.utils.html;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.wltea.analyzer.lucene.IKAnalyzer;

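/**
 * Web page noise reduction: helpers for cleaning HTML markup, measuring
 * element text, counting keywords and segmenting text into words.
 * @author Administrator
 *
 */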
public class HtmlContentParser {

	/** Attribute names that {@link #clean(String)} never removes. */
	private static final Set<String> KEPT_ATTRIBUTES =
			new HashSet<String>(Arrays.asList("href", "src", "class", "id"));

	/**
	 * Tag names (besides {@code a} and {@code img}, which are handled explicitly)
	 * whose elements {@link #clean(String)} keeps in the document.
	 * Stored as a set so that lookups are exact matches, not substring matches.
	 */
	private static final Set<String> KNOWN_TAGS = new HashSet<String>(Arrays.asList((
			"abbr,acronym,address,applet,area,article,aside,audio,b,base,basefont," +
			"bdi,bdo,big,blockquote,body,br,button,canvas,caption,center,cite,code,col," +
			"colgroup,command,datalist,dd,del,details,dfn,dialog,dir,div,dl,dt,em,embed," +
			"fieldset,figcaption,figure,font,footer,form,frame,frameset,h1,h2,h3,h4,h5,h6," +
			"head,header,hr,html,i,iframe,input,ins,kbd,keygen,label,legend," +
			"li,link,main,map,mark,menu,menuitem,meta,meter,nav,noframes,noscript,object," +
			"ol,optgroup,option,output,p,param,pre,progress,q,rp,rt,ruby,s,samp,script," +
			"section,select,small,source,span,strike,strong,style,sub,summary,sup,table,tbody," +
			"td,textarea,tfoot,th,thead,time,title,tr,track,tt,u,ul,var,video,wbr").split(",")));

	/**
	 * Parses the given HTML and strips markup noise from it.
	 * <ul>
	 * <li>{@code img} elements are left untouched.</li>
	 * <li>{@code a} elements and all elements with a known tag keep only their
	 *     {@code href}, {@code src}, {@code class} and {@code id} attributes.</li>
	 * <li>The contents of {@code style} and {@code script} elements are emptied.</li>
	 * <li>Elements with any other tag are removed together with their subtree.</li>
	 * </ul>
	 *
	 * @param html the HTML source to clean
	 * @return the serialized HTML of the cleaned document
	 */
	public static String clean(String html){
		Document doc = Jsoup.parse(html);
		// getAllElements() returns a separate list, so detaching elements from
		// the DOM while walking it is safe.
		for(Element element : doc.getAllElements()){
			if(element instanceof Document){
				// The list includes the document root itself; it is not a real tag
				// and must never be classified as "unknown" and removed.
				continue;
			}
			String tag = element.tagName().toLowerCase(Locale.ROOT);
			if(tag.equals("img")){
				continue;
			}
			if(tag.equals("a") || KNOWN_TAGS.contains(tag)){
				stripAttributes(element);
				if(tag.equals("style") || tag.equals("script")){
					element.empty();
				}
			}else{
				// Decided once per element (not once per attribute), so remove()
				// is never invoked repeatedly on an already detached element.
				element.remove();
			}
		}
		return doc.html();
	}

	/**
	 * Returns the total text length of the leaf elements below the given element.
	 * Only elements without child elements contribute; text that sits directly
	 * inside an element which also has child elements is not counted.
	 *
	 * @param element the root of the subtree to measure
	 * @return the summed {@code text().length()} of all leaf elements
	 */
	public static int textNum(Element element){
		Elements children = element.children();
		if(children.isEmpty()){
			return element.text().length();
		}
		int total = 0;
		for(Element child : children){
			total += textNum(child);
		}
		return total;
	}

	/**
	 * Counts how often the given keywords occur in the text of an element.
	 * Keywords are matched literally (not as regular expressions), occurrences
	 * of one keyword are counted without overlap, and empty keywords are ignored.
	 *
	 * @param element the element whose text is searched
	 * @param str comma-separated keywords
	 * @return the sum of the occurrence counts of all keywords
	 */
	public static int analyzerNum(Element element,String str){
		String content = element.text();
		int num = 0;
		for(String keyword : str.split(",")){
			if(keyword.length() == 0){
				continue;
			}
			int from = content.indexOf(keyword);
			while(from >= 0){
				num++;
				from = content.indexOf(keyword, from + keyword.length());
			}
		}
		return num;
	}

	/**
	 * Segments the given text into words with {@link IKAnalyzer}.
	 *
	 * @param text the text to segment
	 * @return the tokens in order, each one followed by a comma
	 *         (an empty string when no token is produced)
	 * @throws IOException if the token stream fails while reading
	 */
	public static String analyzer(String text) throws IOException{
		StringBuilder ik = new StringBuilder();
		Analyzer anal = new IKAnalyzer(true);
		StringReader reader = new StringReader(text);
		try{
			TokenStream ts = anal.tokenStream("", reader);
			try{
				CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
				// TokenStream consumer contract: reset() before the first
				// incrementToken(), end() after the last one, then close().
				ts.reset();
				while(ts.incrementToken()){
					ik.append(term.toString()).append(',');
				}
				ts.end();
			}finally{
				ts.close();
			}
		}finally{
			reader.close();
			anal.close();
		}
		return ik.toString();
	}

	/**
	 * Removes every attribute of the element whose name is not in
	 * {@link #KEPT_ATTRIBUTES}.
	 *
	 * @param element the element to strip
	 */
	private static void stripAttributes(Element element){
		// Collect the names first: removing attributes while iterating over
		// the element's attribute collection is not safe.
		List<String> doomed = new ArrayList<String>();
		for(Attribute attribute : element.attributes()){
			if(!KEPT_ATTRIBUTES.contains(attribute.getKey())){
				doomed.add(attribute.getKey());
			}
		}
		for(String key : doomed){
			element.removeAttr(key);
		}
	}
}
